library(here)
## here() starts at /Users/hyunhwan/Projects/InProgress/CB2-Experiments/01_gene-level-analysis
library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0     ✔ purrr   0.3.0
## ✔ tibble  2.0.1     ✔ dplyr   0.7.8
## ✔ tidyr   0.8.2     ✔ stringr 1.3.1
## ✔ readr   1.3.1     ✔ forcats 0.3.0
## ── Conflicts ──────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(CB2)
library(CRISPhieRmix)
library(pheatmap)

Load a data file. This is intially from https://doi.org/10.1016/j.celrep.2018.06.027.

df_read <- read_tsv(here("../02_sgRNA-level-analysis/dat/Gsk3_readcount.txt"))
## Parsed with column specification:
## cols(
##   gRNA = col_character(),
##   Gene = col_character(),
##   Before_1 = col_double(),
##   Before_2 = col_double(),
##   Before_3 = col_double(),
##   Before_4 = col_double(),
##   Pos_1 = col_double(),
##   Pos_2 = col_double(),
##   Pos_3 = col_double(),
##   Pos_4 = col_double()
## )
df_count <- df_read %>% unite("id", c("Gene", "gRNA")) %>% column_to_rownames("id")
df_design <- tibble(sample_name = colnames(df_count)) %>% 
  separate("sample_name", c("group", "rep"), sep = "_", remove = F)
df_design

Run CB2

sgstat <- run_estimation(df_count, df_design, "Before", "Pos")
sgstat
genestat <- measure_gene_stats(sgstat)
genestat
summary(sgstat$p_ts)
##        V1        
##  Min.   :0.0000  
##  1st Qu.:0.2968  
##  Median :0.5393  
##  Mean   :0.5311  
##  3rd Qu.:0.7752  
##  Max.   :1.0000

Running CRISPhieRmix

library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind,
##     colMeans, colnames, colSums, dirname, do.call, duplicated,
##     eval, evalq, Filter, Find, get, grep, grepl, intersect,
##     is.unsorted, lapply, lengths, Map, mapply, match, mget, order,
##     paste, pmax, pmax.int, pmin, pmin.int, Position, rank, rbind,
##     Reduce, rowMeans, rownames, rowSums, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which, which.max,
##     which.min
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## The following object is masked from 'package:tidyr':
## 
##     expand
## The following object is masked from 'package:base':
## 
##     expand.grid
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## The following object is masked from 'package:purrr':
## 
##     reduce
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: DelayedArray
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
## 
##     anyMissing, rowMedians
## The following object is masked from 'package:dplyr':
## 
##     count
## Loading required package: BiocParallel
## 
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
## 
##     colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following object is masked from 'package:purrr':
## 
##     simplify
## The following objects are masked from 'package:base':
## 
##     aperm, apply
dds <- DESeqDataSetFromMatrix(df_count, df_design, ~group)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
df_ret <- results(dds) %>% as.data.frame
head(df_ret)
summary(df_ret$pvalue)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.3165  0.5734  0.5470  0.7938  1.0000    1838
df_ret$log2FoldChange[is.na(df_ret$log2FoldChange)] <- 0
df_cmix <- df_ret %>% rownames_to_column("id") %>% 
  separate("id", c("gene", "gRNA"), sep = "_", extra="merge") %>% as.data.frame
df_cmix$gene <- factor(df_cmix$gene, levels = unique(df_cmix$gene))
cmix <- CRISPhieRmix(df_cmix$log2FoldChange, df_cmix$gene)
## Loading required package: mixtools
## mixtools package, version 1.1.0, Released 2017-03-10
## This package is based upon work supported by the National Science Foundation under Grant No. SES-0518772.
## number of iterations= 36
df_cmixret <- tibble(gene = cmix$genes, locfdr = cmix$locfdr, fdr = cmix$FDR)
df_merge <- genestat %>% left_join(df_cmixret, by="gene")
## Warning: Column `gene` joining character vector and factor, coercing into
## character vector
df_merge %>% ggplot(aes(x=-log10(fdr_ts), y=-log10(fdr))) +
  geom_point() + xlab("CB2") + ylab("CRISPhieRmix")

As we have seen below, CB2 detect 205 genes as hits, and CRIPSPhieRmix detect 3,426 genes.

print(sum(genestat$fdr_ts<0.1))
## [1] 205
print(sum(df_cmixret$fdr<0.1))
## [1] 3426

Only five genes were uniquely detected by CB2.

df_merge %>% filter(fdr_ts < 0.1, fdr < 0.1) %>% nrow
## [1] 200

Visualize heatmap (scaled by each row)

genes <- df_merge %>% filter(fdr < 0.1,fdr_ts > 0.1) %>% pull(gene)
print(genes %>% head)
## [1] "0610009D07Rik" "0610010F05Rik" "1110004E09Rik" "1110008J03Rik"
## [5] "1110032A03Rik" "1110059E24Rik"
print(length(genes))
## [1] 3226
df_norm <- df_count 
for(j in 1:ncol(df_norm)) {
  df_norm[,j] <- log2(df_norm[,j] / sum(df_norm[,j]) * 10^6+1)
}

df_norm %>% rownames_to_column("id") %>% separate(id, c("gene", "gRNA"), extra = "merge") %>% 
  filter(gene %in% genes) %>% 
  select(-gene) %>% 
  column_to_rownames("gRNA") -> df_heatmap

df_heatmap[rowSums(df_heatmap) > 0,] %>% 
  pheatmap(show_rownames = F, scale = "row", main = "sgRNA heatmap of hit genes uniquely detected by CRISPhieRmix")

genes <- df_merge %>% filter(fdr < 0.1,fdr_ts < 0.1) %>% pull(gene)
print(genes %>% head)
## [1] "1110008L16Rik" "1110037F02Rik" "1810043H04Rik" "Aars2"        
## [5] "Aco2"          "Adck4"
print(length(genes))
## [1] 200
df_norm <- df_count 
for(j in 1:ncol(df_norm)) {
  df_norm[,j] <- log2(df_norm[,j] / sum(df_norm[,j]) * 10^6+1)
}

df_norm %>% rownames_to_column("id") %>% separate(id, c("gene", "gRNA"), extra = "merge") %>% 
  filter(gene %in% genes) %>% 
  select(-gene) %>% 
  column_to_rownames("gRNA") -> df_heatmap

df_heatmap[rowSums(df_heatmap) > 0,] %>% 
  pheatmap(show_rownames = F, scale = "row", main = "sgRNA heatmap of hit genes detected by both.")

Visualize heatmap (no-scaling)

genes <- df_merge %>% filter(fdr < 0.1,fdr_ts > 0.1) %>% pull(gene)
print(genes %>% head)
## [1] "0610009D07Rik" "0610010F05Rik" "1110004E09Rik" "1110008J03Rik"
## [5] "1110032A03Rik" "1110059E24Rik"
print(length(genes))
## [1] 3226
df_norm <- df_count 
for(j in 1:ncol(df_norm)) {
  df_norm[,j] <- log2(df_norm[,j] / sum(df_norm[,j]) * 10^6+1)
}

df_norm %>% rownames_to_column("id") %>% separate(id, c("gene", "gRNA"), extra = "merge") %>% 
  filter(gene %in% genes) %>% 
  select(-gene) %>% 
  column_to_rownames("gRNA") -> df_heatmap

df_heatmap[rowSums(df_heatmap) > 0,] %>% 
  pheatmap(show_rownames = F, scale = "none",  main = "sgRNA heatmap of hit genes uniquely detected by CRISPhieRmix")

genes <- df_merge %>% filter(fdr < 0.1,fdr_ts < 0.1) %>% pull(gene)
print(genes %>% head)
## [1] "1110008L16Rik" "1110037F02Rik" "1810043H04Rik" "Aars2"        
## [5] "Aco2"          "Adck4"
print(length(genes))
## [1] 200
df_norm <- df_count 
for(j in 1:ncol(df_norm)) {
  df_norm[,j] <- log2(df_norm[,j] / sum(df_norm[,j]) * 10^6+1)
}

df_norm %>% rownames_to_column("id") %>% separate(id, c("gene", "gRNA"), extra = "merge") %>% 
  filter(gene %in% genes) %>% 
  select(-gene) %>% 
  column_to_rownames("gRNA") -> df_heatmap

df_heatmap[rowSums(df_heatmap) > 0,] %>% 
  pheatmap(show_rownames = F, scale = "none", main = "sgRNA heatmap of hit genes detected by both.")